/*******************************************************************************

	"Road Endpoints and City Sizes"
	
	Replication Package
	
	Empirical Analysis Do-File

*******************************************************************************/

clear

*set the folder of this do-file as the working directory:
cd "WRITE HERE"

*set the folder with the working data:
global workingdata "Working Data"	

/*install packages used in this project:
ssc install shapley2
ssc install acreg
ssc install psacalc
ssc install geonear
ssc install geodist
ssc install boottest
ssc install ranktest
ssc install ivreg2
*/

*set scheme for figures:
set scheme s1color

********************************************************************************
*PREPARE VARIABLES
********************************************************************************

*import the data:
use "$workingdata/mun_data.dta", clear

*generate the state:
destring cod_municipio, gen(aux)
gen state = floor(aux/100000)
tab state
drop aux

*generate a dummy if endpoint time>=1:
gen endpoint_dummy = 0 if time_as_endpoint!=.
replace endpoint_dummy = 1 if time_as_endpoint>0 & time_as_endpoint!=.
la var endpoint_dummy "Endpoint dummy"

*generate IHS of time as endpoint:
gen ihstime_as_endpoint = log(time_as_endpoint+(time_as_endpoint^2+1)^0.5)
la var ihstime_as_endpoint "IHS years as endpoint"

*generate the logarithm of distance to Luz;
gen ldist_to_luz = log(dist_to_luz)
la var ldist_to_luz "Log distance to São Paulo"

	*set baseline controls:
	global controls "year ldist_to_luz preexistent_mun"
	
*generate dependent variables:

	*log of urban population:
	gen lpop_urban = log(pop_urban)
	*log of distance next:
	gen ldist_next = log(dist_next)
	*log of distance previous:
	gen ldist_previous = log(dist_previous)	
	*log of urban gdp:
	gen lgdp_urban = log(man_va+serv_va+gov_va)
	gen gdp_urban = man_va+serv_va+gov_va
	*log per capita gdp:
	gen pcgdp = (gdp/pop)
	gen lpcgdp = lgdp - lpop
	
	global LHS "lpop_urban lgdp_urban ldist_next ldist_previous"
	
*prepare some geographic fundamentals:

	*logs of the potential yields and malaria days:
	foreach c in coffee maize sugar malaria {
	gen l`c'_mean = log(`c'_mean)
	}

*drop endpoints, the crossroad of Bauru, and São Paulo:
drop if endpoint==1
drop if dist_to_luz==0

********************************************************************************
*ADD 1872 CENSUS VARIABLES
********************************************************************************

*merge with the AMCs:
gen UFMUN = cod_municipio_6dt
merge 1:1 UFMUN using "$workingdata/amc_historic.dta"
keep if _m==3
drop area1872 estado mun UFMUN _merge

	*check the number of AMCs (to answer referee #1):
	codebook amc*

*now we merge with 1872 information:
merge m:1 amc1872 using "$workingdata/census1872.dta"
keep if _m==3
drop _m

*we may generate the 1872 pop. density:
gen lpop_density1872=log(pop_density1872)
drop pop_density1872

********************************************************************************
*ADD HISTORICAL POPULATION DENSITY
********************************************************************************

*identify the AMC of previous arrival:
gen amc_year=.
replace amc_year=1872 if year>=1872
replace amc_year=1920 if year>=1920
replace amc_year=1940 if year>=1940
replace amc_year=1960 if year>=1960
	tab amc_year
	
*identify the previous census year:
gen census_year=.
replace census_year=1872 if year>=1872
replace census_year=1890 if year>=1890
replace census_year=1900 if year>=1900
replace census_year=1920 if year>=1920
replace census_year=1940 if year>=1940
replace census_year=1950 if year>=1950
replace census_year=1960 if year>=1960
	tab census_year
	
*now we merge:
gen amc=""
foreach v in 1872 1920 1940 1960 {
replace amc=amc`v' if amc_year==`v'
}	
merge m:1 amc_year amc using "$workingdata/pastpopdensity.dta"
drop if _m==2
drop _m

*now we generate the pop density at arrival:
gen hist_pop_density=.
replace hist_pop_density=pop_density1872 + (pop_density1890-pop_density1872)*((year-census_year)/(1890-census_year)) if census_year==1872
replace hist_pop_density=pop_density1890 + (pop_density1900-pop_density1890)*((year-census_year)/(1900-census_year)) if census_year==1890
replace hist_pop_density=pop_density1900 + (pop_density1920-pop_density1900)*((year-census_year)/(1920-census_year)) if census_year==1900
replace hist_pop_density=pop_density1920 + (pop_density1940-pop_density1920)*((year-census_year)/(1940-census_year)) if census_year==1920
replace hist_pop_density=pop_density1940 + (pop_density1950-pop_density1940)*((year-census_year)/(1950-census_year)) if census_year==1940
replace hist_pop_density=pop_density1950 + (pop_density1960-pop_density1950)*((year-census_year)/(1960-census_year)) if census_year==1950
replace hist_pop_density=pop_density1960 + (pop_density1970-pop_density1960)*((year-census_year)/(1970-census_year)) if census_year==1960
gen lhist_pop_density=log(hist_pop_density)
la var lhist_pop_density "Population density at railroad arrival"
foreach v in 1872 1890 1900 1920 1940 1950 1960 1970 {
drop pop_density`v'
}
drop amc amc1872 amc1920 amc1940 amc1960
drop amc_year

********************************************************************************
*ADD HISTORICAL MIGRATION CONTROLS
********************************************************************************

*merge with cumulative up to that year:
merge m:1 year time_as_endpoint using "$workingdata/cum_migration.dta"
keep if _m==3
drop _m

*set as multiples of 100,000s:
replace migration_tot = migration_tot/(100000)

********************************************************************************
*SUMMARY STATISTICS (TABLE 1)
********************************************************************************

*change some labels:
la var pop "Population"
gen pop_ths = pop/1000
gen pop_density = pop/mun_area
la var pop_density "Population density"
la var pcgdp "Per capita GDP (R\$, '000s)"
la var pop_ths "Population ('000s)"
la var pop_urban "Urban population"
gen pop_urban_ths = pop_urban/1000
la var pop_urban_ths "Urban population ('000s)"
gen urb_rate = pop_urban/pop
la var urb_rate "Urbanization rate"
la var dist_to_luz "Distance to Luz (km)"
la var alt_mean "Altitude (m)"
la var tri_mean "Terrain ruggedness index (m)"
la var dist_to_luz "Distance to São Paulo (km)"
la var endpoint_dummy "Years as endpoint $\ge 1$"
la var dist_next "Distance to next municipality (km)"
la var terraroxa "Share of \textit{terra roxa}"
la var latosol "Share of latosols"
la var acrisol "Share of acrisols"
la var river "Any main river"
la var malaria_mean "Malaria suitability (days a year)"
la var coffee_mean "Potential yield (ton/ha) - coffee"
la var maize_mean "Potential yield (ton/ha) - maize"
la var sugar_mean "Potential yield (ton/ha) - sugarcane"
la var ycoord "Latitude"
la var xcoord "Longitude"

*adjust pc GDP with 2010 PPP dollars:
gen pcgdp_dollars = pcgdp*(1.386/1.759)
la var pcgdp_dollars "Per capita GDP (2010 PPP US\$, '000s)"

*overall sum stats:
global y_sum "time_as_endpoint endpoint_dummy pop_ths pop_density pop_urban_ths urb_rate pcgdp_dollars dist_to_luz dist_next year found_year preexistent_mun alt_mean tri_mean river malaria_mean coffee_mean maize_mean sugar_mean terraroxa latosol acrisol ycoord xcoord"

		sum $y_sum	

*and also get aggregate urban population (cited in paper, over 13 million inhabitants):
preserve
gen aux = 1
collapse (rawsum) pop_urban, by(aux)
	*replace pop by million:
	replace pop_urban = pop_urban/(10^6)
sum 
restore

********************************************************************************
*BALANCE TEST (TABLE A.1)
********************************************************************************
	
*baseline controls and fundamentals:
foreach v in year ldist_to_luz preexistent_mun alt_mean tri_mean lcoffee_mean lmaize_mean lsugar_mean river lmalaria_mean terraroxa latosol acrisol ycoord xcoord {
acreg `v' time_as_endpoint, spatial latitude(ycoord) longitude(xcoord) dist(50)
}

*balance with respect to 1872 conditions:
foreach v in lpop_density1872 sh_slave1872 sh_foreign1872 sh_literate1872 sh_school1872 sh_nonag1872 lhist_pop_density {
	acreg `v' time_as_endpoint if year>=1872, spatial latitude(ycoord) longitude(xcoord) dist(50)
}

********************************************************************************
*TIME AS ENDPOINT VERSUS YEAR (FIGURE A1)
********************************************************************************

preserve
collapse (mean) time_as_endpoint, by(year)

graph twoway (scatter time_as_endpoint year, msymbol(o) mcolor(navy)) ///
, ytitle("Mean time as endpoint") xtitle("Railroad arrival year")  plotr(style(none)) ///
xlabel(1870(10)1970) ylabel(0(2)12) xmtick(1865(5)1970) legend()

restore

********************************************************************************
*GRAPH ON RAILROAD ARRIVAL VERSUS INCORPORATION YEAR (FIGURE A2)
********************************************************************************

graph twoway (scatter found_year year if preexistent_mun==1, msymbol(oh) mcolor(navy)) ///
(scatter found_year year if preexistent_mun==0, msymbol(o) mcolor(red)) ///
(line year year, lcolor(black) lwidth(thin)) ///
, ytitle("Incorporation year") xtitle("Railroad arrival year")  plotr(style(none)) ///
xlabel(1870(10)1970) ylabel(1650(50)2000) xmtick(1865(5)1970) ///
legend(order(1 2) lab(1 "Incorporated before railroad arrival") lab(2 "Incorporated after railroad arrival"))

********************************************************************************
*MAIN OLS RESULTS (TABLE 2)
********************************************************************************

*without controls:
foreach v in $LHS {
di "Variable: `v'"
acreg `v' time_as_endpoint, spatial latitude(ycoord) longitude(xcoord) dist(50)
}	

*baseline controls:
foreach v in $LHS {
di "Variable: `v'"
acreg `v' time_as_endpoint $controls, spatial latitude(ycoord) longitude(xcoord) dist(50)
}

*in levels:
gen pop_urban_level = pop_urban/1000
gen gdp_urban_level = (gdp_urban/1000)
gen dist_next_level = dist_next
gen dist_previous_level = dist_previous
	sum *_level
foreach v in pop_urban_level gdp_urban_level dist_next_level dist_previous_level {
acreg `v' time_as_endpoint $controls, spatial latitude(ycoord) longitude(xcoord) dist(50)
}

*add controls for geographic fundamental:
gen alt_mean_sq = alt_mean^2
gen tri_mean_sq = tri_mean^2
global fund_controls "alt_mean alt_mean_sq tri_mean tri_mean_sq lcoffee_mean lmaize_mean lsugar_mean lmalaria_mean river terraroxa acrisol latosol"
foreach v in $LHS {
	di "Variable: `v'"
	acreg `v' time_as_endpoint $controls $fund_controls, spatial latitude(ycoord) longitude(xcoord) dist(50)
	}
	
*add also quadratic polynomial of latitude and longitude:
gen xcoord_sq = xcoord^2
gen ycoord_sq = ycoord^2
gen xcoordXycoord = xcoord*ycoord
global coord_controls "xcoord ycoord xcoord_sq ycoord_sq xcoordXycoord"
foreach v in $LHS {
	di "Variable: `v'"
	acreg `v' time_as_endpoint $controls $fund_controls $coord_controls, spatial latitude(ycoord) longitude(xcoord) dist(50)
	}
	
*find lower bounds under proportional selection (note the R2max are 1.3 of the R2 above)
	reg lpop_urban time_as_endpoint $controls $fund_controls $coord_controls
	psacalc beta time_as_endpoint, rmax(0.632) delta(1)
	reg lgdp_urban time_as_endpoint $controls $fund_controls $coord_controls
	psacalc beta time_as_endpoint, rmax(0.676) delta(1) 
	reg ldist_next time_as_endpoint $controls $fund_controls $coord_controls	 
	psacalc beta time_as_endpoint, rmax(0.542) delta(1)
		
	
********************************************************************************
*SHAPLEY VALUE DECOMPOSITION (TABLE A20)
********************************************************************************

foreach v in $LHS {
reg `v' time_as_endpoint $controls $fund_controls $coord_controls, vce(robust)
shapley2 , stat("r2") group("time_as_endpoint, year ldist_to_luz preexistent_mun, alt_mean alt_mean_sq tri_mean tri_mean_sq lcoffee_mean lmaize_mean lsugar_mean lmalaria_mean river terraroxa acrisol latosol, xcoord ycoord xcoord_sq ycoord_sq xcoordXycoord")
}


********************************************************************************
*ASSOCIATION GRAPHS (FIGURE 4)
********************************************************************************

*urban population:
graph twoway (scatter lpop_urban time_as_endpoint, msymbol(o) mcolor(gray)) ///
(lfit lpop_urban time_as_endpoint , lcolor(black)) ///
, ytitle("Log Urban Population") xtitle("Years as Endpoint")  plotr(style(none)) ///
legend(off)

*urban GDP:
graph twoway (scatter lgdp_urban time_as_endpoint, msymbol(o) mcolor(gray)) ///
(lfit lgdp_urban time_as_endpoint, lcolor(black)) ///
, ytitle("Log Urban GDP") xtitle("Years as Endpoint")  plotr(style(none)) ///
legend(off)

*distance to next town:
graph twoway (scatter ldist_next time_as_endpoint, msymbol(o) mcolor(gray)) ///
(lfit ldist_next time_as_endpoint, lcolor(black)) ///
, ytitle("Log Dist. Next Mun.") xtitle("Years as Endpoint")  plotr(style(none)) ///
legend(off)

*distance to previous town:	
graph twoway (scatter ldist_previous time_as_endpoint, msymbol(o) mcolor(gray)) ///
(lfit ldist_previous time_as_endpoint, lcolor(black)) ///
, ytitle("Log Dist. Previous Mun.") xtitle("Years as Endpoint")  plotr(style(none)) ///
legend(off)

********************************************************************************
*USE DISTRICT DATA (TABLE A10)
********************************************************************************

*now run at the district level:
preserve
use "$workingdata/dist_data.dta", clear

	*prepare:
	drop if dist_to_luz==0
	drop if endpoint==1
	
	gen ldist_to_luz = log(dist_to_luz)
	gen lpop_urban = log(pop_urban)
	gen ldist_next = log(dist_next)
	gen ldist_previous = log(dist_previous)	

	*regression:
	foreach v in lpop_urban ldist_next ldist_previous {
	di "Variable: `v'"
	acreg `v' time_as_endpoint year ldist_to_luz preexistent_mun, spatial latitude(ycoord) longitude(xcoord) dist(50)
	}
	
restore

********************************************************************************
*ROBUSTNESS TO TIME MEASURE (TABLE A4)
********************************************************************************

*and I can use a variety of cutoffs:
foreach dt in 1 2 5 10 {
gen time_as_endpoint_high = 0
replace time_as_endpoint_high = 1 if time_as_endpoint >= `dt'
la var time_as_endpoint_high "Time as endpoint $\ge$ cutoff"

	*and now I regress:
	di "Cutoff: `dt'"
	foreach v in $LHS  {
	di "Variable: `v'"
	acreg `v' time_as_endpoint_high $controls, spatial latitude(ycoord) longitude(xcoord) dist(50)
	}
	
	*and drop:
	drop time_as_endpoint_high
}

*use the IHS:
foreach v in $LHS  {
di "Variable: `v'"
acreg `v' ihstime_as_endpoint $controls, spatial latitude(ycoord) longitude(xcoord) dist(50)
}

*use the quadratic polynomial:
gen time_as_endpoint_sq = time_as_endpoint^2
la var time_as_endpoint_sq "Square of years as endpoint"
foreach v in $LHS  {
di "Variable: `v'"
acreg `v' time_as_endpoint time_as_endpoint_sq $controls, spatial latitude(ycoord) longitude(xcoord) dist(50)
}

*use both intensive margin + extensive margin:
gen ext_margin = 0
replace ext_margin = 1 if time_as_endpoint >= 1
la var ext_margin "Time as endpoint $\ge$ 1 year"
foreach v in $LHS  {
di "Variable: `v'"
acreg `v' ext_margin time_as_endpoint $controls, spatial latitude(ycoord) longitude(xcoord) dist(50)
}
	
********************************************************************************
*ROBUSTNESS TO ALTERNATIVE MEASURES OF DISTANCE (TABLE A9)
********************************************************************************

*prepare:
foreach v in dist_next_min dist_next_linear dist_previous_linear {
gen l`v' = log(`v')
}	

*run the regressions:
foreach v in dist_next_min dist_next_linear dist_previous_linear {
di "Variable: `v'"
acreg l`v' time_as_endpoint $controls, spatial latitude(ycoord) longitude(xcoord) dist(50)
}		

********************************************************************************
*FURTHER ROBUSTNESS (TABLE A5)
********************************************************************************

*control for yearly GDP growth:
foreach v in $LHS  {
di "Variable: `v'"
acreg `v' time_as_endpoint $controls gdp_growth, spatial latitude(ycoord) longitude(xcoord) dist(50)
}

*year fixed effects:
foreach v in $LHS  {
di "Variable: `v'"
acreg `v' time_as_endpoint ldist_to_luz preexistent_mun, spatial latitude(ycoord) longitude(xcoord) dist(50) pfe1(year) correctr2
}

*company FEs:
foreach v in $LHS  {
di "Variable: `v'"
acreg `v' time_as_endpoint $controls, spatial latitude(ycoord) longitude(xcoord) dist(50) pfe1(company_nr) correctr2
}

*fundamentals + both FEs ("all controls"):
foreach v in $LHS {
di "Variable: `v'"
xi: acreg `v' time_as_endpoint ldist_to_luz preexistent_mun $fund_controls $coord_controls, pfe1(year) pfe2(company_nr) spatial latitude(ycoord) longitude(xcoord) dist(50) correctr2
}

********************************************************************************
*ROBUSTNESS TO CONFOUNDERS (TABLE A.8)
********************************************************************************
	
*regress both the own and the previous time as endpoint:
acreg time_as_endpoint time_as_endpoint_previous $controls, spatial latitude(ycoord) longitude(xcoord) dist(50)
foreach v in $LHS {
di "Variable: `v'"
acreg `v' time_as_endpoint time_as_endpoint_previous $controls, spatial latitude(ycoord) longitude(xcoord) dist(50)
}	

*control for migration flows:
acreg time_as_endpoint migration_tot $controls, spatial latitude(ycoord) longitude(xcoord) dist(50)
foreach v in $LHS  {
di "Variable: `v'"
acreg `v' time_as_endpoint $controls migration_tot, spatial latitude(ycoord) longitude(xcoord) dist(50)
}	

********************************************************************************
*ROBUSTNESS TO HISTORICAL CONTROLS (TABLE A.6)
********************************************************************************
	
*1872 controls:
global controls1872 "lpop_density1872 sh_*1872"
foreach v in $LHS  {
di "Variable: `v'"
acreg `v' time_as_endpoint $controls $controls1872 if year>=1872, spatial latitude(ycoord) longitude(xcoord) dist(50)
}		
	
*predicted regional population density:
foreach v in $LHS  {
di "Variable: `v'"
acreg `v' time_as_endpoint $controls lhist_pop_density if year>=1872, spatial latitude(ycoord) longitude(xcoord) dist(50)
}	

*all historic controls + fundamentals:
foreach v in $LHS  {
di "Variable: `v'"
acreg `v' time_as_endpoint ldist_to_luz preexistent_mun lhist_pop_density $controls1872 $fund_controls $coord_controls if year>=1872, spatial latitude(ycoord) longitude(xcoord) dist(50)
}	

********************************************************************************
*ROBUSTNESS TO SUB-SAMPLES (TABLE A.7)
********************************************************************************

*drop extreme time as endpoint observations:
tabstat time_as_endpoint, s(max p99 p95 p90)
foreach v in $LHS {
di "Drop top 1%"
acreg `v' time_as_endpoint year $controls if time_as_endpoint<=16, spatial latitude(ycoord) longitude(xcoord) dist(50)
	di "Drop top 5%"
acreg `v' time_as_endpoint $controls if time_as_endpoint<=9, spatial latitude(ycoord) longitude(xcoord) dist(50)
}

*already incorporated vs non-incorporated municipalities:
foreach v in $LHS {
di "Variable: `v', not already incorporated"
acreg `v' time_as_endpoint year ldist_to_luz if preexistent_mun==0, spatial latitude(ycoord) longitude(xcoord) dist(50)
di "Variable: `v', only already incorporated"
acreg `v' time_as_endpoint year ldist_to_luz if preexistent_mun==1, spatial latitude(ycoord) longitude(xcoord) dist(50)
}

********************************************************************************
*EVIDENCE ON S.J. DO RIO PRETO (FIGURE 3)
********************************************************************************

*keep the Araraquarense:
preserve
keep if company_nr==5

*mark S.J. do Rio Preto:
gen riopreto = 0
replace riopreto = 1 if cod_municipio=="3549805"

sort dist_to_luz
*now we can have the graph:
 twoway (scatter pop_urban_ths dist_to_luz if riopreto==0, msymbol(o) mcolor(gray)  yaxis(1)) ///
 (scatter pop_urban_ths dist_to_luz if riopreto==1, msymbol(Sh) mcolor(black) yaxis(1)) ///
(line year dist_to_luz, lcolor(black) yaxis(2) connect(stepstair)) ///
, ytitle("Urban Population ('000s)", axis(1)) ytitle("Railroad Arrival Year", axis(2)) xtitle("Distance to São Paulo (km)")  plotr(style(none)) ///
legend(order(2 - 1 - 3) label(1 "Population, other EFA municipalities (left axis)") label(2 "Population, São José do Rio Preto (left axis)") label(3 "Railroad arrival year (right axis)") ) ///
xmtick( 302.7288 319.4442 325.0854 338.1857 354.7059 362.451 375.9318 387.4698 ///
397.2571 409.6532 425.8643 438.1269 443.0413 467.1143 478.79 488.5775 504.5241 ///
515.6219 526.6992 546.5347 559.0131 567.9726 583.759 600.4354 610.2067 614.3646 625.7399 634.8303 641.2776, ///
tlcolor(gray) tposition(inside) tlstyle(grid) ) ///
xlabel(300(100)600) ylabel(1900(10)1950, axis(2)) ylabel(0(100)400, axis(1))

*and now we tab the variables (for the information in the text):
tabstat pop_urban_ths dist_next dist_previous time_as_endpoint if riopreto==0, s(mean sd max min count)
tabstat pop_urban_ths dist_next dist_previous time_as_endpoint if riopreto==1, s(mean)

restore

********************************************************************************
*RESULTS FROM PREVIOUS CENSUS YEARS (TABLE A2)
********************************************************************************

preserve

rename pop_urban pop_urban2010
keep cod_municipio time_as_endpoint found_year year dist_to_luz mun_area endpoint pop_urban2000 pop_urban2010 dist_next dist_previous xcoord ycoord

*import the datasets:
gen amc_year = 2000
rename cod_municipio amc

	*1991:
	append using "$workingdata/amc1991_data.dta"
	replace amc_year=1991 if amc_year==.
	*1970:
	append using "$workingdata/amc1970_data.dta"
	replace amc_year=1970 if amc_year==.
	
	
*prepare dependent variables:
foreach y in 1970 1980 1991 2000 2010 {
gen lpop_urban`y' = log(pop_urban`y')
}

*now we generate the control variables:

	*preexistent variable:
	gen preexistent_mun = 0
	replace preexistent_mun = 1 if found_year<=year & year!=. & found_year!=.
	la var preexistent_mun "Incorporated before railroad arrival"
	
	*log distance to luz:
	gen ldist_to_luz = log(dist_to_luz)
	
*now we select the sample:
drop if endpoint==1
drop if dist_to_luz==0

*now we estimate the regressions (excluding if still endpoint):
gen aux = year+time_as_endpoint
foreach y in 1970 1980 {
	di "YEAR = `y'"
acreg lpop_urban`y' time_as_endpoint $controls if aux<=`y' & aux!=. & amc_year==1970, spatial latitude(ycoord) longitude(xcoord) dist(50)
}
foreach y in 1991 {
	di "YEAR = `y'"
acreg lpop_urban`y' time_as_endpoint $controls if aux<=`y' & aux!=. & amc_year==1991 , spatial latitude(ycoord) longitude(xcoord) dist(50)
}
foreach y in 2000 2010 {
	di "YEAR = `y'"
acreg lpop_urban`y' time_as_endpoint $controls if aux<=`y' & aux!=. & amc_year==2000 , spatial latitude(ycoord) longitude(xcoord) dist(50)
}
	
**********************************************
*STATISTICS ON RELATIVE CITY SIZES (TABLE A3)*
**********************************************	


*summary statistics by year:
keep if amc_year==1970
sum pop_urban1970 pop_urban1980 pop_urban1991 pop_urban2000 pop_urban2010

*compare urbanization rates:
foreach y in 1970 1980 1991 2000 2010 {
gen urb_rate`y' = pop_urban`y'/pop`y'
}
sum urb_rate1970 urb_rate1980 urb_rate1991 urb_rate2000 urb_rate2010

*get log-log and rank correlation:
foreach y in 1970 1980 1991 2000 2010 {
di "Year=`y'"
reg lpop_urban`y' lpop_urban1970
spearman pop_urban`y' pop_urban1970
}
	
restore

********************************************************************************
*ROBUSTNESS TO ALTERNATIVE INFERENCE (TABLE A11)
********************************************************************************

*create matrix first:
mat inference= J(8,5,.)

*estimate Conley standard errors for the OLS:
local i=1
foreach dt in 1 25 50 75 100 {

local i=`i'+1

local j=1
mat inference[`i',`j'] = `dt'

di "*** DISTANCE: `dt' ****"
foreach y in $LHS {
local j=`j'+1
acreg `y' time_as_endpoint $controls, spatial latitude(ycoord) longitude(xcoord) dist(`dt')
	mat b=e(b)
	mat V=e(V)
	mat inference[1,`j']=b[1,1]
	mat inference[`i',`j']=sqrt(V[1,1])
}			  

}	

	*now do after clustering by Regiao Imediata:
	local i=`i'+1
	local j=1
	foreach y in $LHS {
		local j=`j'+1
	reg `y' time_as_endpoint $controls , vce(cluster cod_rgi)
		mat b=e(b)
		mat V=e(V)
		mat inference[1,`j']=b[1,1]
		mat inference[`i',`j']=sqrt(V[1,1])
	boottest time_as_endpoint=0,  boott(wild) seed(03548453) reps(10000) nograph
	}
	
	*now do after clustering by Regiao Intermediaria:
	local i=`i'+1
	local j=1
	foreach y in $LHS {
		local j=`j'+1
	reg `y' time_as_endpoint $controls , vce(cluster cod_rgint)
		mat b=e(b)
		mat V=e(V)
		mat inference[1,`j']=b[1,1]
		mat inference[`i',`j']=sqrt(V[1,1])
	boottest time_as_endpoint=0,  boott(wild) seed(03548453) reps(10000) nograph
	}	
	
mat li inference	

********************************************************************************
*IV RESULTS
********************************************************************************

*generate the IV:
gen aux = dist_nextfound
replace aux =  154.2545 if dist_nextfound==.
tabstat dist_nextfound, s(count mean p10 p25 median p75 p90 max)
	gen iv = log(aux)
	
*generate the placebo IV:	
drop aux
tabstat dist_prfound, s(count mean p10 p25 median p75 p90 max)
gen aux = dist_prfound
replace aux = 209.1758 if dist_prfound==.
	gen iv_placebo = log(aux)	
	
*generate minimum log distance to either up or down:
egen min_log_distance = rowmin(dist_nextfound dist_prfound)
	tabstat min_log_distance, s(count mean p10 p25 median p75 p90 max)
	replace min_log_distance = 209.1758 if min_log_distance==.
	replace min_log_distance = log(min_log_distance)
	
*la var:
la var iv "Log distance to next incorporated mun."
la var min_log_distance "Log distance to closest incorporated mun."	
la var iv_placebo "Log distance to previous incorporated mun."

********************************************************************************
*MAIN IV RESULTS (PANEL A OF TABLE 3) AND REDUCED FORM (PANEL A OF TABLE A.12) *
********************************************************************************
	
*first stage:	
acreg time_as_endpoint iv $controls, spatial latitude(ycoord) longitude(xcoord) dist(50)

*reduced form:
foreach y in $LHS {
acreg `y' iv $controls, spatial latitude(ycoord) longitude(xcoord) dist(50)
}
	
*2SLS:
foreach y in $LHS {
acreg `y' $controls (time_as_endpoint = iv), spatial latitude(ycoord) longitude(xcoord) dist(50)
}
	
*find AR statistics:
foreach y in $LHS {
gen LHS = `y'- 0*time_as_endpoint
acreg LHS iv $controls, spatial latitude(ycoord) longitude(xcoord) dist(50)
test iv=0
drop LHS	
}
	
*****************************************************************************************
*CONTROL FOR FUNDAMENTALS (PANEL B OF TABLE 3) AND REDUCED FORM (PANEL B OF TABLE A.12) *
*****************************************************************************************

*first stage:
acreg time_as_endpoint iv  $controls $coord_controls $fund_controls , spatial latitude(ycoord) longitude(xcoord) dist(50)

*reduced form:
foreach y in $LHS {
acreg `y' iv $controls $coord_controls $fund_controls , spatial latitude(ycoord) longitude(xcoord) dist(50)
}

*2SLS:
foreach y in $LHS {	
acreg `y' $controls $coord_controls $fund_controls  (time_as_endpoint = iv) , spatial latitude(ycoord) longitude(xcoord) dist(50)
}
	
*find AR statistics:
foreach y in $LHS {
gen LHS = `y'- 0*time_as_endpoint
acreg LHS iv $controls $coord_controls $fund_controls, spatial latitude(ycoord) longitude(xcoord) dist(50)
test iv=0
drop LHS	
}

************************************
*CONTROL FOR REMOTENESS (TABLE A16)*
************************************

*first stage:
acreg time_as_endpoint iv $controls min_log_distance, spatial latitude(ycoord) longitude(xcoord) dist(50)

*reduced form:
foreach y in $LHS {
acreg `y' iv $controls min_log_distance, spatial latitude(ycoord) longitude(xcoord) dist(50)
}

*2SLS:
foreach y in $LHS {
acreg `y' $controls min_log_distance (time_as_endpoint = iv) , spatial latitude(ycoord) longitude(xcoord) dist(50)
}
	
************************
*PLACEBO IV (TABLE A15)*
************************

*first stage:
acreg time_as_endpoint iv_placebo  $controls, spatial latitude(ycoord) longitude(xcoord) dist(50)	

*reduced form:
foreach y in $LHS {
acreg `y' iv_placebo $controls , spatial latitude(ycoord) longitude(xcoord) dist(50)
}
	
************************************************************************
*CHANGES IN THE IV: ADD A DISTANCE AT THE TOP (PANELS A,B OF TABLE A17)*
************************************************************************

preserve
foreach dt in 50 100 {

replace iv = log(154.2545+`dt') if dist_nextfound==.

acreg time_as_endpoint iv  $controls, spatial latitude(ycoord) longitude(xcoord) dist(50)

foreach y in $LHS {
acreg `y' $controls (time_as_endpoint = iv), spatial latitude(ycoord) longitude(xcoord) dist(50)
}

}
restore

******************************************
*FLEXIBLE APPROACH (PANEL C OF TABLE A17)*
******************************************

gen iv_dummy = 0
replace iv_dummy = 1 if dist_nextfound==.
gen iv_im = 0
replace iv_im = iv if dist_nextfound!=.

acreg time_as_endpoint iv_dummy iv_im  $controls, spatial latitude(ycoord) longitude(xcoord) dist(50)
foreach y in $LHS {
acreg `y' $controls (time_as_endpoint = iv_dummy iv_im), spatial latitude(ycoord) longitude(xcoord) dist(50)
}
	
****************************************
*USE 10-YEAR LAG (PANEL D OF TABLE A17)*
****************************************

*create the IV:
drop aux
gen aux = dist_nextfound_10yr
sum aux
replace aux =  132.837 if dist_nextfound_10yr==.
gen iv_lag10 = log(aux)

*label variable:
la var iv_lag10 "Log distance to next incorporated mun. (10-year lag)"

*first stage:	
acreg time_as_endpoint iv_lag10 $controls, spatial latitude(ycoord) longitude(xcoord) dist(50)
	
*2SLS:
foreach y in $LHS {
acreg `y' $controls (time_as_endpoint = iv_lag10), spatial latitude(ycoord) longitude(xcoord) dist(50)
}
	
***********************************************************************
*CENSOR IV (TABLE A13 AND PANEL C OF TABLE 3)*	
***********************************************************************

*generate the censored IV:
foreach dt in 10 20 30 40 50 {
gen iv_m`dt'=log(`dt')
replace iv_m`dt'=iv if dist_nextfound>`dt'
}

*regressions:
foreach dt in 10 20 30 40 50 {

*first stage:
acreg time_as_endpoint iv_m`dt' $controls, spatial latitude(ycoord) longitude(xcoord) dist(50)

*reduced form:
foreach y in $LHS {
acreg `y' iv_m`dt' $controls, spatial latitude(ycoord) longitude(xcoord) dist(50)
}

*IV results:
foreach y in $LHS {
acreg `y' $controls (time_as_endpoint = iv_m`dt'), spatial latitude(ycoord) longitude(xcoord) dist(50)
}

}
	
*find AR statistics for the 20 km case:
foreach y in $LHS {
gen LHS = `y'- 0*time_as_endpoint
acreg LHS iv_m20 $controls, spatial latitude(ycoord) longitude(xcoord) dist(50)
test iv_m20=0
drop LHS	
}
	
************************************************************
*USE ONLY WHETHER NEXT NOT ALREADY INCORPORATED (TABLE A14)*
************************************************************

*first stage:
acreg time_as_endpoint iv  $controls if dist_nextfound!=dist_next, spatial latitude(ycoord) longitude(xcoord) dist(50)

*2sls:
foreach y in $LHS {
acreg `y' year ldist_to_luz (time_as_endpoint = iv) if dist_nextfound!=dist_next, spatial latitude(ycoord) longitude(xcoord) dist(50)
}

*reduced form:
foreach y in $LHS {
acreg `y' iv $controls  if dist_nextfound!=dist_next, spatial latitude(ycoord) longitude(xcoord) dist(50)
}

*find AR test:
foreach y in $LHS {
gen LHS = `y'- 0*time_as_endpoint
acreg LHS iv $controls if dist_nextfound!=dist_next, spatial latitude(ycoord) longitude(xcoord) dist(50)
test iv=0
drop LHS	
}
	
*************************************
*GRAPHS OF IV VARIATION (FIGURE A5) *
*************************************

*get the residuals:
foreach v in iv time_as_endpoint lpop_urban lgdp_urban ldist_next ldist_previous {
reg `v' $controls
predict `v'_res, residuals
}

*first stage graph:
graph twoway (scatter time_as_endpoint_res iv_res, msymbol(o) mcolor(gray)) ///
(lfit time_as_endpoint_res iv_res, lcolor(dknavy)) ///
, ytitle("Years as Endpoint (res.)") xtitle("IV (res.)")  plotr(style(none)) ///
legend(off)

*urban population:
graph twoway (scatter lpop_urban_res iv_res, msymbol(o) mcolor(gray)) ///
(lfit lpop_urban_res iv_res, lcolor(dknavy)) ///
, ytitle("Log Urban Population (res.)") xtitle("IV (res.)")  plotr(style(none)) ///
legend(off)

*urban GDP:
graph twoway (scatter lgdp_urban_res iv_res, msymbol(o) mcolor(gray)) ///
(lfit lgdp_urban_res iv_res, lcolor(dknavy)) ///
, ytitle("Log Urban GDP (res.)") xtitle("IV (res.)")   plotr(style(none)) ///
legend(off)

*distance to next town:
graph twoway (scatter ldist_next_res iv_res, msymbol(o) mcolor(gray)) ///
(lfit ldist_next_res iv_res, lcolor(dknavy)) ///
, ytitle("Log Dist. Next Mun. (res.)") xtitle("IV (res.)")   plotr(style(none)) ///
legend(off)

*and distance to previous town:	
graph twoway (scatter ldist_previous_res iv_res, msymbol(o) mcolor(gray)) ///
(lfit ldist_previous_res iv_res, lcolor(dknavy)) ///
, ytitle("Log Dist. Previous Mun. (res.)") xtitle("IV (res.)")   plotr(style(none)) ///
legend(off)

********************************************************************************
*PLAUSIBLE EXOGENOUS APPROACH (UCI)  (FIGURE A6)                                          
********************************************************************************	

********************
*FOR LOG URBAN POP *
********************

*set maximum (reduced form), minimum (zero), and UCI max (placebo) values:
local gammamax = 0.2518277
local gammaplacebo = 0.0595936
local gammamin = 0

*set the matrix:
mat uci= J(301,3,.)

*run the loop:
forvalues i=0/300 {

	*set the gamma:
	if `i'<150 {
	local gammavalue = `gammamin'+(`i'/150)*(`gammaplacebo'-`gammamin')
	}
	else {
	local gammavalue = `gammaplacebo'+((`i'-150)/150)*(`gammamax'-`gammaplacebo')
	}
	di "*** Value of gamma: `gammavalue' ***"
	mat uci[`i'+1,1] = `gammavalue'
	
	*run the regression and store the estimates:
	gen aux_y = lpop_urban - `gammavalue'*iv
	quietly acreg aux_y $controls (time_as_endpoint = iv), spatial latitude(ycoord) longitude(xcoord) dist(50)
	mat b=e(b)
	mat uci[`i'+1,2] = b[1,1]
	mat V=e(V)
	mat uci[`i'+1,3] = V[1,1]
	drop aux_y
}

*make graphs and calculate interval:
preserve
clear
svmat uci

	*rename:
	rename uci1 gamma_value
	rename uci2 coeff
	rename uci3 variance
	gen sd = sqrt(variance)
	
	*generate confidence intervals:
	gen CI95_ub = coeff+1.96*sd
	gen CI95_lb = coeff-1.96*sd
	gen CI90_ub = coeff+1.64*sd
	gen CI90_lb = coeff-1.64*sd
	
	*generate UCI using the placebo as maximum:
	foreach c in 95 90 {
	gen aux = CI`c'_ub if gamma_value<=`gammaplacebo'
	egen UCI`c'_ub = max(aux) if gamma_value<=`gammaplacebo'
	drop aux
	gen aux = CI`c'_lb if gamma_value<=`gammaplacebo'
	egen UCI`c'_lb = min(aux) if gamma_value<=`gammaplacebo'
	drop aux
	}
	sum UCI90_* UCI95_*
	sum gamma_value if CI95_lb>0
	sum gamma_value if CI90_lb>0
	
	*graph:
	twoway (line coeff gamma_value, lp(solid) lcolor(black)) ///
	(line CI95_ub gamma_value, lp(dash) lcolor(black)) (line CI95_lb gamma_value, lp(dash) lcolor(black)) ///
	(line CI90_ub gamma_value, lp(dot) lcolor(black)) (line CI90_lb gamma_value, lp(dot) lcolor(black)) ///
	(line UCI95_ub gamma_value, lp(solid) lwidth(medthick) lcolor(navy)) (line UCI95_lb gamma_value, lp(solid) lwidth(medthick) lcolor(navy)), ///
	xscale(range(0 0.255)) xlabel(0(0.03)0.24) yscale(range(-0.21 0.52)) ylabel(-0.2(0.1)0.5) ytitle("Beta") xtitle("Gamma") /// xmlabel(0.060, labcolor(red) labsize(small))
	legend(order(1 2 4 6) col(1)  label(1 "2SLS coefficient for each gamma") label(2 "95% CI for each gamma") label(4 "90% CI for each gamma") ///
	label(6 "95% UCI under placebo as highest gamma") ) ///
	yline(0, lstyle(foreground)) yline(0, lcolor(gray) lw(thin)) xline(`gammaplacebo', lcolor(gray) lw(thin)) plotr(style(none)) plotregion(margin(0))
	

restore

********************
*FOR LOG URBAN GDP *
********************

*set maximum (reduced form), minimum (zero), and UCI max (placebo) values:
local gammamax = .2437212
local gammaplacebo = .0700435
local gammamin = 0

*set the matrix:
mat uci= J(301,3,.)

*run the loop:
forvalues i=0/300 {

	*set the gamma:
	if `i'<150 {
	local gammavalue = `gammamin'+(`i'/150)*(`gammaplacebo'-`gammamin')
	}
	else {
	local gammavalue = `gammaplacebo'+((`i'-150)/150)*(`gammamax'-`gammaplacebo')
	}
	di "*** Value of gamma: `gammavalue' ***"
	mat uci[`i'+1,1] = `gammavalue'
	
	*run the regression and store the estimates:
	gen aux_y = lgdp_urban - `gammavalue'*iv
	quietly acreg aux_y $controls (time_as_endpoint = iv), spatial latitude(ycoord) longitude(xcoord) dist(50)
	mat b=e(b)
	mat uci[`i'+1,2] = b[1,1]
	mat V=e(V)
	mat uci[`i'+1,3] = V[1,1]
	drop aux_y
}

*make graphs and calculate interval:
preserve
clear
svmat uci

	*rename:
	rename uci1 gamma_value
	rename uci2 coeff
	rename uci3 variance
	gen sd = sqrt(variance)
	
	*generate confidence intervals:
	gen CI95_ub = coeff+1.96*sd
	gen CI95_lb = coeff-1.96*sd
	gen CI90_ub = coeff+1.64*sd
	gen CI90_lb = coeff-1.64*sd
	
	*generate UCI using the placebo as maximum:
	foreach c in 95 90 {
	gen aux = CI`c'_ub if gamma_value<=`gammaplacebo'
	egen UCI`c'_ub = max(aux) if gamma_value<=`gammaplacebo'
	drop aux
	gen aux = CI`c'_lb if gamma_value<=`gammaplacebo'
	egen UCI`c'_lb = min(aux) if gamma_value<=`gammaplacebo'
	drop aux
	}
	sum UCI90_* UCI95_*
	sum gamma_value if CI95_lb>0
	sum gamma_value if CI90_lb>0
	
	*graph:
	twoway (line coeff gamma_value, lp(solid) lcolor(black)) ///
	(line CI95_ub gamma_value, lp(dash) lcolor(black)) (line CI95_lb gamma_value, lp(dash) lcolor(black)) ///
	(line CI90_ub gamma_value, lp(dot) lcolor(black)) (line CI90_lb gamma_value, lp(dot) lcolor(black)) ///
	(line UCI95_ub gamma_value, lp(solid) lwidth(medthick) lcolor(navy)) (line UCI95_lb gamma_value, lp(solid) lwidth(medthick) lcolor(navy)), ///
	xscale(range(0 0.255)) xlabel(0(0.03)0.24) yscale(range(-0.21 0.52)) ylabel(-0.2(0.1)0.5) ytitle("Beta") xtitle("Gamma") /// xmlabel(0.060, labcolor(red) labsize(small))
	legend(off) /// 
	yline(0, lstyle(foreground)) yline(0, lcolor(gray) lw(thin)) xline(`gammaplacebo', lcolor(gray) lw(thin)) plotr(style(none)) plotregion(margin(0))

restore

********************
*FOR LOG DIST NEXT *
********************

*set maximum (reduced form), minimum (zero), and UCI max (placebo) values:
local gammamax = .1363513
local gammaplacebo = .0521607
local gammamin = 0

*set the matrix:
mat uci= J(301,3,.)

*run the loop:
forvalues i=0/300 {

	*set the gamma:
	if `i'<150 {
	local gammavalue = `gammamin'+(`i'/150)*(`gammaplacebo'-`gammamin')
	}
	else {
	local gammavalue = `gammaplacebo'+((`i'-150)/150)*(`gammamax'-`gammaplacebo')
	}
	di "*** Value of gamma: `gammavalue' ***"
	mat uci[`i'+1,1] = `gammavalue'
	
	*run the regression and store the estimates:
	gen aux_y = ldist_next - `gammavalue'*iv
	quietly acreg aux_y $controls (time_as_endpoint = iv), spatial latitude(ycoord) longitude(xcoord) dist(50)
	mat b=e(b)
	mat uci[`i'+1,2] = b[1,1]
	mat V=e(V)
	mat uci[`i'+1,3] = V[1,1]
	drop aux_y
}

*make graphs and calculate interval:
preserve
clear
svmat uci

	*rename:
	rename uci1 gamma_value
	rename uci2 coeff
	rename uci3 variance
	gen sd = sqrt(variance)
	
	*generate confidence intervals:
	gen CI95_ub = coeff+1.96*sd
	gen CI95_lb = coeff-1.96*sd
	gen CI90_ub = coeff+1.64*sd
	gen CI90_lb = coeff-1.64*sd
	
	*generate UCI using the placebo as maximum:
	foreach c in 95 90 {
	gen aux = CI`c'_ub if gamma_value<=`gammaplacebo'
	egen UCI`c'_ub = max(aux) if gamma_value<=`gammaplacebo'
	drop aux
	gen aux = CI`c'_lb if gamma_value<=`gammaplacebo'
	egen UCI`c'_lb = min(aux) if gamma_value<=`gammaplacebo'
	drop aux
	}
	sum UCI90_* UCI95_*
	sum gamma_value if CI95_lb>0
	sum gamma_value if CI90_lb>0
	
	*graph:
	twoway (line coeff gamma_value, lp(solid) lcolor(black)) ///
	(line CI95_ub gamma_value, lp(dash) lcolor(black)) (line CI95_lb gamma_value, lp(dash) lcolor(black)) ///
	(line CI90_ub gamma_value, lp(dot) lcolor(black)) (line CI90_lb gamma_value, lp(dot) lcolor(black)) ///
	(line UCI95_ub gamma_value, lp(solid) lwidth(medthick) lcolor(navy)) (line UCI95_lb gamma_value, lp(solid) lwidth(medthick) lcolor(navy)), ///
	xscale(range(0 0.13)) xlabel(0(0.02)0.12) yscale(range(-0.21 0.32)) ylabel(-0.2(0.1)0.3) ytitle("Beta") xtitle("Gamma") /// xmlabel(0.060, labcolor(red) labsize(small))
	legend(off) ///
	yline(0, lstyle(foreground)) yline(0, lcolor(gray) lw(thin)) xline(`gammaplacebo', lcolor(gray) lw(thin)) plotr(style(none)) plotregion(margin(0))

restore
	
********************************************************************************
*EFFECTS ON PER CAPITA URBAN GDP (TABLE A21)
********************************************************************************

*generate variable:
gen pcgdp_urban = (gdp_urban/pop_urban)
gen lpcgdp_urban = lgdp_urban - lpop_urban
sum pcgdp_urban

*run the OLS results:
foreach y in lpcgdp_urban pcgdp_urban {
acreg `y' time_as_endpoint $controls, spatial latitude(ycoord) longitude(xcoord) dist(50)
}

*run the IV results:
foreach y in lpcgdp_urban pcgdp_urban {
acreg `y' $controls (time_as_endpoint = iv), spatial latitude(ycoord) longitude(xcoord) dist(50)
}

********************************************************************************
*EFFECTS ON CITY HIERARCHY (TABLE A22)
********************************************************************************

*merge:
merge 1:1 cod_municipio_6dt using "$workingdata/sectors2010.dta"
keep if _m==3	
drop _m

	*get summary statistics:
	sum nr_*

*run the OLS results:
foreach c in classe subclasse {
acreg nr_`c' time_as_endpoint $controls, spatial latitude(ycoord) longitude(xcoord) dist(50)
}

*run the IV results:
foreach c in classe subclasse {
acreg nr_`c' $controls (time_as_endpoint = iv), spatial latitude(ycoord) longitude(xcoord) dist(50)
}

*run the OLS results with serial correlation:
foreach c in classe subclasse {
acreg nr_`c' time_as_endpoint time_as_endpoint_previous $controls, spatial latitude(ycoord) longitude(xcoord) dist(50)
}
	
********************************************************************************
*FRONTIER VARIABLE: BOLSONARO VOTE (TABLE A23)
********************************************************************************

*merge:
merge 1:1 cod_municipio using "$workingdata/frontier_vars.dta"
keep if _m==3
drop _m

*OLS:
foreach r in 1 2  {
acreg sh_bolsonaro_round`r' time_as_endpoint $controls, spatial latitude(ycoord) longitude(xcoord) dist(50)
}

*2SLS:
foreach r in 1 2  {
acreg sh_bolsonaro_round`r' $controls (time_as_endpoint = iv) , spatial latitude(ycoord) longitude(xcoord) dist(50)
}	

********************************************************************************
*COMPARISON BETWEEN ACCESS AND TIME AS ENDPOINT (TABLE A18)
********************************************************************************

*import extended dataset:
use "$workingdata/mun_data_neighboring.dta", clear

*generate the dependent variables:
	gen lpop_urban = log(pop_urban)
	gen lgdp_urban = log(man_va+serv_va+gov_va)
	
*generate the time as endpoint given access:
gen time_as_endpoint_rail = time_as_endpoint if rail_access==1
replace time_as_endpoint_rail = 0 if rail_access==0

*labeling:	
la var rail_access "Railroad access"
la var time_as_endpoint_rail "Time as endpoint (conditional on access)"

*prepare controls:
foreach c in coffee maize sugar malaria {
	gen l`c'_mean = log(`c'_mean)
}
gen alt_mean_sq = alt_mean^2
gen tri_mean_sq = tri_mean^2
gen xcoord_sq = xcoord^2
gen ycoord_sq = ycoord^2
gen xcoordXycoord = xcoord*ycoord

*regression with no controls:
foreach y in lpop_urban lgdp_urban {
di "Simple comparison wrt to access"
acreg `y' rail_access, spatial latitude(ycoord) longitude(xcoord) dist(50)
di "Both variables"
acreg `y' rail_access time_as_endpoint_rail, spatial latitude(ycoord) longitude(xcoord) dist(50)
}		
	
*regression with controls:
foreach y in lpop_urban lgdp_urban {
di "Simple comparison wrt to access"
acreg `y' rail_access $coord_controls $fund_controls, spatial latitude(ycoord) longitude(xcoord) dist(50)
di "Both variables"
acreg `y' rail_access time_as_endpoint_rail $coord_controls $fund_controls, spatial latitude(ycoord) longitude(xcoord) dist(50)
}	
